When we talked about the neural attention mechanism, we tried to learn the reverse function on variable-length strings. Now we will focus on what the encoder actually learned in order to generalize the reverse function.
First, we argue that in order to learn the reverse function, our encoder needs to learn how to count the index of each input character. To test this assumption, we check whether training the network helps us predict the step (position) of each encoder output.
%matplotlib inline
from models import AttentionNetwork, EncoderDecoderNetwork
from sklearn import linear_model
import numpy as np
import dynet as dy
import data
from collections import defaultdict
from random import shuffle
from copy import copy
class AttentionNetworkPredictEncoder(EncoderDecoderNetwork):
    """Encoder-decoder network that can record its encoder outputs per step.

    While ``should_save`` is true, every call to ``_encode_string`` stores the
    encoder output of each step in ``encoder_outputs`` (keyed by step index).
    ``predict_encoder_step`` uses those recordings to measure how well a linear
    model can recover the step index from the encoder output alone.

    NOTE(review): despite its name the class derives from
    ``EncoderDecoderNetwork``, not ``AttentionNetwork`` -- confirm this is
    intended.
    """

    def __init__(self, enc_layers, dec_layers, embeddings_size, enc_state_size, dec_state_size):
        EncoderDecoderNetwork.__init__(
            self, enc_layers, dec_layers, embeddings_size, enc_state_size, dec_state_size)
        # step index -> list of encoder output vectors recorded at that step
        self.encoder_outputs = defaultdict(list)
        # recording is off by default so ordinary training/generation is unaffected
        self.should_save = False

    def _encode_string(self, embedded_string):
        """Run the encoder RNN over *embedded_string* and return its outputs.

        When ``should_save`` is set, each step's output is also appended to
        ``encoder_outputs`` under its step index (converted with ``npvalue()``).
        """
        initial_state = self.ENC_RNN.initial_state()
        # _run_rnn returns the hidden state of every slice of the RNN
        hidden_states = self._run_rnn(initial_state, embedded_string)
        # save the encoder outputs by their step number if needed
        if self.should_save:
            for step, vec in enumerate(hidden_states):
                self.encoder_outputs[step].append(vec.npvalue())
        return hidden_states

    def _create_dataset(self, encoder_outputs):
        """Turn recorded outputs into shuffled ``(vector, step)`` train/val lists.

        *encoder_outputs* maps a step index to the vectors recorded at that
        step. The pairs are shuffled and split 90% / 10%.
        """
        # Use the argument (the original silently ignored it in favour of
        # self.encoder_outputs, which is what the only caller passes anyway).
        dataset = [
            (vec, step)
            for step, vecs in encoder_outputs.items()
            for vec in vecs
        ]
        shuffle(dataset)
        cut = int(len(dataset) * 9 / 10)
        return dataset[:cut], dataset[cut:]

    def predict_encoder_step(self, input_strings):
        """Return the mean absolute error of predicting the step from encoder outputs.

        Runs ``generate`` on every string in *input_strings* while recording the
        encoder outputs, fits a ``LassoLars`` linear model on 90% of the
        ``(output, step)`` pairs and returns the mean absolute error of its
        predictions on the remaining 10%.

        Raises:
            ValueError: if too few encoder outputs were recorded to form both a
                training and a validation split.
        """
        self.should_save = True
        try:
            # generate outputs over input strings (records encoder outputs)
            for input_string in input_strings:
                self.generate(input_string)
            self.should_save = False

            # create encoded character to step dataset
            train, val = self._create_dataset(self.encoder_outputs)
            if not train or not val:
                raise ValueError(
                    "not enough encoder outputs to build train and validation splits")
            features, steps = zip(*train)
            val_features, val_steps = zip(*val)

            # predict the step using a linear model
            clf = linear_model.LassoLars(alpha=0.01)
            clf.fit(np.array(features), np.array(steps))
            predicted = clf.predict(np.array(val_features))
            return np.mean(np.abs(np.array(val_steps) - predicted))
        finally:
            # Always leave the network in its normal, non-recording state and
            # drop the recordings, even if generation or fitting failed.
            self.should_save = False
            self.encoder_outputs = defaultdict(list)
Error on an untrained model:
# Hyper-parameters shared by the experiments below.
ENC_RNN_NUM_OF_LAYERS = 1
DEC_RNN_NUM_OF_LAYERS = 1
EMBEDDINGS_SIZE = 4
ENC_STATE_SIZE = 32
DEC_STATE_SIZE = 32

# Probe inputs: the first element of 2000 samples drawn with
# data.sample_model(14, 15).
long_sequences = [data.sample_model(14, 15)[0] for _ in range(2000)]

# Freshly initialised (untrained) network: baseline step-prediction error.
att = AttentionNetworkPredictEncoder(
    enc_layers=ENC_RNN_NUM_OF_LAYERS,
    dec_layers=DEC_RNN_NUM_OF_LAYERS,
    embeddings_size=EMBEDDINGS_SIZE,
    enc_state_size=ENC_STATE_SIZE,
    dec_state_size=DEC_STATE_SIZE,
)
print(att.predict_encoder_step(long_sequences))
Error on a trained model:
def train(network, train_set, val_set, epochs=10):
    """Train *network* with simple SGD and return its summed validation loss.

    Args:
        network: model exposing ``model`` and ``get_loss(input, output)``.
        train_set: sequence of ``(input_string, output_string)`` pairs.
        val_set: iterable of ``(input_string, output_string)`` pairs.
        epochs: number of passes over *train_set*.

    Returns:
        The sum of the per-example loss values over *val_set* after training.
    """
    trainer = dy.SimpleSGDTrainer(network.model)
    # Iterate the training set once per epoch instead of materialising
    # ``train_set * epochs``; the examples are visited in the same order.
    for _ in range(epochs):
        for input_string, output_string in train_set:
            loss = network.get_loss(input_string, output_string)
            # evaluate the loss expression before back-propagating
            loss.value()
            loss.backward()
            trainer.update()
    return sum(
        network.get_loss(input_string, output_string).value()
        for input_string, output_string in val_set
    )
# Train the network on the project's train/validation sets, then measure
# the step-prediction error on the same long probe sequences again.
train(network=att, train_set=data.train_set, val_set=data.val_set)
print(att.predict_encoder_step(input_strings=long_sequences))
Let's see what happens if we train our network on a task that does not require learning the order of the characters. We will use the sort function:
from random import choice, randrange
def sample_model_sort(min_length, max_lenth):
    """Return a random string together with its sorted version.

    The length is drawn with ``randrange(min_length, max_lenth)`` (upper bound
    exclusive) and every character is chosen from ``data.characters`` without
    its last entry (presumably the EOS marker -- confirm against ``data``).
    """
    length = randrange(min_length, max_lenth)
    alphabet = data.characters[:-1]
    letters = [choice(alphabet) for _ in range(length)]
    # first element: the sampled string; second: the same characters sorted
    return ''.join(letters), ''.join(sorted(letters))
# Train/validation pairs for the sort task (random string -> sorted string).
train_set = [sample_model_sort(1, data.MAX_STRING_LEN) for _ in range(3000)]
val_set = [sample_model_sort(1, data.MAX_STRING_LEN) for _ in range(50)]
# Fresh probe inputs: the first element of 2000 samples drawn with
# data.sample_model(14, 15).
long_sequences = [data.sample_model(14, 15)[0] for _ in range(2000)]
att = AttentionNetworkPredictEncoder(
    ENC_RNN_NUM_OF_LAYERS, DEC_RNN_NUM_OF_LAYERS, EMBEDDINGS_SIZE, ENC_STATE_SIZE, DEC_STATE_SIZE)
# Train the new network on the sort task before probing it; without this
# call the reported error would be that of an untrained model, not of a
# model trained on sorting.
train(att, train_set, val_set)
print(att.predict_encoder_step(long_sequences))
As we can see, an encoder trained on a task that requires knowing the character index can predict the index much better.
Another assumption about the encoder is that it learns the distribution of the input. We will test this by checking whether input generated by a simpler distribution can be learned by a simpler encoder.
Let's see how the encoder size affects our output:
import matplotlib.pyplot as plt
def plot_preformence_by_encoder_size(Model, train_set, val_set):
    """Plot the validation loss of *Model* as a function of its state size.

    For every size from 1 to 31 a new ``Model`` is built with that encoder and
    decoder state size (one layer each, embedding size 4), trained with
    ``train`` on *train_set*, and its summed loss on *val_set* is plotted
    against the size.

    Args:
        Model: network class taking ``(enc_layers, dec_layers,
            embeddings_size, enc_state_size, dec_state_size)``.
        train_set: training pairs passed to ``train``.
        val_set: validation pairs passed to ``train``; must support ``len``.
    """
    enc_layers = 1
    dec_layers = 1
    embeddings_size = 4

    sizes = list(range(1, 32))
    losses = [
        train(Model(enc_layers, dec_layers, embeddings_size, size, size),
              train_set, val_set)
        for size in sizes
    ]

    plt.plot(sizes, losses)
    # The y-limit scales with the validation set that was actually evaluated
    # (the original used the global data.val_set regardless of the argument).
    # The x-axis starts at 4, so sizes 1-3 are trained but fall outside the view.
    plt.axis([4, 32, 0, len(val_set) * data.MAX_STRING_LEN])
    plt.show()
# Loss-vs-size curve on the project's own train/validation data.
plot_preformence_by_encoder_size(
    Model=AttentionNetwork, train_set=data.train_set, val_set=data.val_set)

# Four letters followed by the end-of-sequence marker.
characters = list("abcd") + [data.EOS]
def sample_model(min_length, max_lenth):
    """Return a random string with alternating character pools, and its reverse.

    The length is drawn with ``randrange(min_length, max_lenth)`` (upper bound
    exclusive). Characters at even positions come from the even-indexed
    entries of ``data.characters[:-1]`` and characters at odd positions from
    the odd-indexed entries.

    NOTE(review): the module-level ``characters`` list is not used here;
    ``data.characters`` is -- confirm that is intended.
    """
    length = randrange(min_length, max_lenth)
    alphabet = data.characters[:-1]
    # pools[0] serves even positions, pools[1] serves odd positions
    pools = (alphabet[0::2], alphabet[1::2])
    letters = [choice(pools[position % 2]) for position in range(length)]
    text = ''.join(letters)
    return text, text[::-1]
# Data drawn from the alternating-pool distribution defined by sample_model.
train_set = [
    sample_model(min_length=1, max_lenth=data.MAX_STRING_LEN)
    for _ in range(3000)
]
val_set = [
    sample_model(min_length=1, max_lenth=data.MAX_STRING_LEN)
    for _ in range(50)
]
# Same loss-vs-size sweep as before, now on the simpler distribution.
plot_preformence_by_encoder_size(
    Model=AttentionNetwork, train_set=train_set, val_set=val_set)